Statefarm Kaggle submission (fast.ai homework3)

What I'll need to do:

  • set up data structure into sample, train, valid, test
  • Import VGG16
  • pop the top layer, train it
  • set all fully connected layers to trainable
  • Improvements:
    • play with dropout parameter
    • add data augmentation
    • stack multiple versions of the classifier
    • apply batch norm
    • have a setup that adjusts the learning rate (see the scheduler sketch below)
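
For the learning-rate point, one option I may fall back on is Keras's LearningRateScheduler callback, which can be passed to fit()/fit_generator(). A minimal sketch; the halve-every-two-epochs rule is just an example, not something I've tuned:

    from keras.callbacks import LearningRateScheduler

    # Example schedule: start at 1e-3 and halve the learning rate every two epochs.
    # The exact rule here is an arbitrary placeholder.
    def lr_schedule(epoch):
        return 0.001 * (0.5 ** (epoch // 2))

    lr_callback = LearningRateScheduler(lr_schedule)
    # then pass callbacks=[lr_callback] to fit() / fit_generator()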

These are the general imports; always make sure to run these first.


In [1]:
import os
import zipfile
import shutil
import csv
import bcolz
os.environ["KERAS_BACKEND"] = "theano"
import keras
import numpy as np
from keras.utils.data_utils import get_file
from keras.models import load_model
from keras.layers.normalization import BatchNormalization
from keras.layers import Dense, Dropout, Flatten, Lambda
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.models import Sequential
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
from keras.utils.np_utils import to_categorical

model_url = "http://files.fast.ai/models/"
model_name = "vgg16.h5"
cache_dir = "models"


Using Theano backend.
Using gpu device 0: Tesla K80 (CNMeM is enabled with initial size: 90.0% of memory, cuDNN 5103)
/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/sandbox/cuda/__init__.py:600: UserWarning: Your cuDNN version is more recent than the one Theano officially supports. If you see any problems, try updating Theano or downgrading cuDNN to version 5.
  warnings.warn(warn)

Data structure

First we set up the data structure in the processed directory, with proper:

  • sample (drawn from ~15% of the drivers, itself split into train/valid)
  • train
  • valid (~20% of the drivers, moved out of train)
  • test (all test images under a single unknown directory)

The split is done per driver rather than per image, so the same driver never ends up in both a training and a validation set.
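
The cell below should leave data/processed looking roughly like this:

    processed/
        train/c0 ... c9          <- extracted from imgs.zip
        valid/c0 ... c9          <- moved out of train (~20% of drivers)
        sample/train/c0 ... c9   <- copied from train (~15% of drivers)
        sample/valid/c0 ... c9   <- moved out of sample/train (~40% of the sample drivers)
        test/unknown/            <- all test images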

In [ ]:
raw_path = os.path.join(os.getcwd(), os.pardir, 'data', 'raw')
processed_path = os.path.join(os.getcwd(), os.pardir, 'data', 'processed')

# Make directories sample, valid, train, test, first check if this whole step is necessary
if os.path.exists(os.path.join(processed_path, 'sample')):
    print 'Sample directory already exists, no need to do data structuring!'
else:
    os.mkdir(os.path.join(processed_path, 'sample'))
    os.mkdir(os.path.join(processed_path, 'sample', 'train'))
    os.mkdir(os.path.join(processed_path, 'sample', 'valid'))
    os.mkdir(os.path.join(processed_path, 'valid'))
    
    # Extract Kaggle zipfiles to correct path
    print 'Extracting zips, this may take a while...'
    img_zip_handle = zipfile.ZipFile(os.path.join(raw_path, 'imgs.zip'), 'r')
    img_zip_handle.extractall(processed_path)
    img_zip_handle.close()
    
    csv_zip_handle = zipfile.ZipFile(os.path.join(raw_path, 'driver_imgs_list.csv.zip'), 'r')
    csv_zip_handle.extractall(processed_path)
    csv_zip_handle.close()
    print 'Done extracting zips!'
    
    # Set up sample directory structure
    for i in range(10):
        dirname = 'c' + str(i)
        os.mkdir(os.path.join(processed_path, 'sample', 'train', dirname))
        os.mkdir(os.path.join(processed_path, 'sample', 'valid', dirname))
        os.mkdir(os.path.join(processed_path, 'valid', dirname))
        
    os.mkdir(os.path.join(processed_path, 'test', 'unknown'))
    for filename in os.listdir(os.path.join(processed_path, 'test')):
        if filename.endswith('.jpg'):
            src = os.path.join(processed_path, 'test', filename)
            dest = os.path.join(processed_path, 'test', 'unknown', filename)
            shutil.move(src, dest)
        
    data = np.genfromtxt(os.path.join(processed_path, 'driver_imgs_list.csv'), delimiter=',', dtype=None)
    data = data[1:,:]
    drivers = np.unique(data[:,0])
    num_drivers = drivers.shape[0]
    # Copy the images of ~15% of the drivers into the sample folder
    sample_drivers_amount = int(np.floor(num_drivers*0.15))
    sample_drivers = np.random.choice(drivers, sample_drivers_amount, replace=False)

    # Move the images of ~20% of the drivers into the valid folder
    validation_drivers_amount = int(np.floor(num_drivers*0.2))
    validation_drivers = np.random.choice(drivers, validation_drivers_amount, replace=False)

    # Set up sample set
    for i in range(sample_drivers_amount):
        driver_name = sample_drivers[i]
        driver_columns = data[data[:,0] == driver_name]
        for j in range(10):
            driver_class = 'c' + str(j)
            dest = os.path.join(processed_path, 'sample', 'train', driver_class)
            class_columns = driver_columns[driver_columns[:,1] == driver_class]
            for filename in class_columns[:,2]:
                src = os.path.join(processed_path, 'train', driver_class, filename)
                shutil.copyfile(src, os.path.join(dest, filename))

    # Now move ~40% of the sample drivers from sample/train to sample/valid
    sample_drivers_validation_amount = int(np.floor(sample_drivers_amount*0.4))
    sample_drivers_validation = np.random.choice(sample_drivers, 
                                                 sample_drivers_validation_amount, 
                                                 replace=False)

    for i in range(sample_drivers_validation_amount):
        driver_name = sample_drivers_validation[i]
        driver_columns = data[data[:,0] == driver_name]
        for j in range(10):
            driver_class = 'c' + str(j)
            class_columns = driver_columns[driver_columns[:,1] == driver_class]
            for filename in class_columns[:,2]:
                dest = os.path.join(processed_path, 'sample', 'valid', driver_class, filename)
                src = os.path.join(processed_path, 'sample', 'train', driver_class, filename)
                shutil.move(src, dest)

    # Set up validation set
    for i in range(validation_drivers_amount):
        driver_name = validation_drivers[i]
        driver_columns = data[data[:,0] == driver_name]

        for j in range(10):
            driver_class = 'c' + str(j)
            class_columns = driver_columns[driver_columns[:,1] == driver_class]
            for filename in class_columns[:,2]:
                src = os.path.join(processed_path, 'train', driver_class, filename)
                dest = os.path.join(processed_path, 'valid', driver_class, filename)
                shutil.move(src, dest)

VGG16() setup boilerplate


In [ ]:
def add_conv_block(model, layers, filters):
    # VGG-style block: `layers` x (zero-padding + 3x3 conv + ReLU), then a 2x2 max-pool
    for i in range(layers):
        model.add(ZeroPadding2D((1,1)))
        model.add(Convolution2D(filters, 3, 3, activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))
    return model
    
def add_fc_block(model, dropout):
    # VGG-style fully connected block: 4096-unit ReLU layer followed by dropout
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(dropout))
    return model

In [ ]:
class vgg16():
    def __init__(self, dropout=0.5):
        self.vgg_mean = np.array([123.68, 116.779, 103.939], dtype=np.float32).reshape([3,1,1])
        self.create(dropout)
        
    def create(self, dropout):
        def vgg_preprocess(x, mean):
            mean = np.array(mean)
            x = x - mean
            return x[:, ::-1]  # reverse the channel axis: RGB -> BGR (the VGG weights expect BGR)
        
        model = self.model = Sequential()
        
        model.add(Lambda(vgg_preprocess, 
                         input_shape=(3, 224, 224), 
                         output_shape=(3, 224, 224),
                         arguments = {'mean': self.vgg_mean.tolist()}
                        ))
        
        model = add_conv_block(model, 2, 64)
        model = add_conv_block(model, 2, 128)
        model = add_conv_block(model, 3, 256)
        model = add_conv_block(model, 3, 512)
        model = add_conv_block(model, 3, 512)
        
        model.add(Flatten())
        
        model = add_fc_block(model, dropout)
        model = add_fc_block(model, dropout)
        model.add(Dense(1000, activation='softmax'))
        
        model.load_weights(get_file(model_name, model_url+model_name, cache_subdir=cache_dir))

Load in data with generators

Here I set up the generators for the training and validation data.


In [ ]:
DEBUG = True
data_dir = os.path.join(os.getcwd(), os.pardir, 'data')
model_dir = os.path.join(os.getcwd(), os.pardir, 'models')
if DEBUG:
    path = os.path.join(data_dir, 'processed', 'sample')
    batch_size = 4
    epochs = 2
else:
    path = os.path.join(data_dir, 'processed')
    batch_size = 64
    epochs = 5

train_path = os.path.join(path, 'train')
val_path = os.path.join(path, 'valid')
train_batches = ImageDataGenerator().flow_from_directory(train_path, 
                                                         target_size=(224,224), 
                                                         batch_size=batch_size, 
                                                         shuffle=True)
val_batches = ImageDataGenerator().flow_from_directory(val_path, 
                                                       target_size=(224,224), 
                                                       batch_size=batch_size, 
                                                       shuffle=True)

Finetuning the model

  • Pop the top (1000-way ImageNet) layer and replace it with a 10-way softmax output, matching our one-hot encoded classes
  • Retrain the model with the new dense layer (everything else frozen), which gives a good starting point for later fine-tuning
  • Save the model, so that we can start toying with it in the next section

In [ ]:
lr = 0.001

model = vgg16(dropout=0.5).model
model.pop()
for layer in model.layers: layer.trainable=False
model.add(Dense(10, activation='softmax'))
model.compile(optimizer=Adam(lr), loss='categorical_crossentropy', metrics=['accuracy'])
model.fit_generator(train_batches, 
                    samples_per_epoch=train_batches.nb_sample, 
                    nb_epoch=epochs, 
                    validation_data=val_batches, 
                    nb_val_samples=val_batches.nb_sample)

model.save(os.path.join(model_dir, 'model_with_new_top.h5'))

New model architecture

Now that we have the trained model, we should probably make all the FC layers trainable. Additionally, we can start playing with:

  • learning rate schedule
  • batchnorm
  • data augmentation
  • setting different epochs
  • some other kind of regularisation? (see the weight-decay sketch below)
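
For the regularisation point, one candidate is L2 weight decay on the new dense layers. A minimal sketch in the Keras 1 API used here; the 128-unit layer mirrors fc_model, but input_dim and the 0.01 decay factor are placeholder values I haven't tuned:

    from keras.models import Sequential
    from keras.layers import Dense
    from keras.regularizers import l2

    # Sketch: an fc_model-style dense layer with L2 weight decay on its weights
    reg_model = Sequential()
    reg_model.add(Dense(128, activation='relu', W_regularizer=l2(0.01), input_dim=512))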

First, import the model from when we saved it. Then:

  • Separate convolutional layers from fully connected ones
  • Make a new convolutional architecture with whatever we want to implement
  • Put them together
  • Train

In [6]:
old_model = load_model(os.path.join(os.getcwd(), 
                                    os.pardir, 
                                    'models', 
                                    'model_with_new_top.h5'))

Batch normalisation

Let's implement batch normalisation first; it'll speed up the search for a good learning rate. From this link we know that BatchNormalization() needs to be applied after the activation.


In [7]:
flatten_index = [index for index,layer in enumerate(old_model.layers) if type(layer).__name__ == 'Flatten'][0]

# Keep everything between the preprocessing Lambda and the last max-pool;
# that max-pool gets added back at the start of fc_model below.
conv_model_layers = old_model.layers[1:flatten_index-1]
conv_model = Sequential(conv_model_layers)

In [4]:
def fc_model(dropout):
    model = Sequential()

    model.add(MaxPooling2D(input_shape=conv_model.layers[-1].output_shape[1:]))
    model.add(Flatten())
    
    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(dropout))
    
    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(dropout))
    
    model.add(Dense(10, activation='softmax'))
    return model

Data augmentation

Let's set up new batch generators, this time making use of data augmentation. We only augment the training input; there's no point augmenting the validation input, since no learning takes place on it. The training generators have shuffle set to False because their conv outputs will be saved to disk, so the image order needs to be reproducible (and the labels have to line up).

Some minor debug settings


In [17]:
DEBUG = False
data_dir = os.path.join(os.getcwd(), os.pardir, 'data')
model_dir = os.path.join(os.getcwd(), os.pardir, 'models')
if DEBUG:
    path = os.path.join(data_dir, 'processed', 'sample')
    batch_size = 4
    epochs = 2
else:
    path = os.path.join(data_dir, 'processed')
    batch_size = 64
    epochs = 5
# Define test_path only after path has been set, so it points at the right directory
test_path = os.path.join(path, 'test')

Conv stack output

Using only the convolutional part of VGG16, I generate feature outputs for the augmented and non-augmented training data and save them to disk.


In [ ]:
train_path = os.path.join(path, 'train')
val_path = os.path.join(path, 'valid')

train_image_gen = ImageDataGenerator(rotation_range=15,
                                     height_shift_range=0.05,
                                     width_shift_range=0.1,
                                     shear_range = 0.1,
                                     channel_shift_range=20,
                                    )

aug_train_batches = train_image_gen.flow_from_directory(train_path, 
                                                         target_size=(224,224), 
                                                         batch_size=batch_size,
                                                         class_mode='categorical',
                                                         shuffle=False)

train_batches = ImageDataGenerator().flow_from_directory(train_path, 
                                                          target_size=(224,224), 
                                                          batch_size=batch_size,
                                                          class_mode='categorical',
                                                          shuffle=False)

val_batches = ImageDataGenerator().flow_from_directory(val_path, 
                                                        target_size=(224,224), 
                                                        batch_size=batch_size, 
                                                        shuffle=False)

print 'Predicting, this may take a while...'
conv_model_predictions_augmented = conv_model.predict_generator(aug_train_batches,
                                                                aug_train_batches.nb_sample*2,
                                                               )
conv_model_predictions = conv_model.predict_generator(train_batches,
                                                      train_batches.nb_sample,
                                                     )
val_predictions = conv_model.predict_generator(val_batches,
                                               val_batches.nb_sample,
                                              )

print 'Done predicting!'
# Concatenating augmented and non-augmented predictions
conv_model_predictions = np.concatenate([conv_model_predictions_augmented, conv_model_predictions])

prediction_labels = to_categorical(train_batches.classes)

# The labels repeat 3 times: two augmented passes plus one plain pass over the training data
prediction_labels = np.concatenate([prediction_labels]*3)

Test convolutions


In [ ]:
test_path = os.path.join(path, 'test')
test_generator = ImageDataGenerator().flow_from_directory(test_path, 
                                                           target_size=(224,224), 
                                                           batch_size=batch_size,
                                                           class_mode='categorical',
                                                           shuffle=False)
print 'Predicting test features, this might take a while...'
conv_model_test_inputs = conv_model.predict_generator(test_generator,
                                                      test_generator.nb_sample
                                                     )
print 'Done predicting!'

In [ ]:
# save_array is defined in the "Save everything to disk" section below; run that cell first
save_array(os.path.join(model_dir, 'test_inputs.bc'), conv_model_test_inputs)

Save everything to disk

Saving everything to disk so I don't need to generate it every time


In [3]:
def save_array(location, array):
    instance = bcolz.carray(array, rootdir=location, mode='w')
    instance.flush()
    
def load_array(location):
    return bcolz.open(location)[:]

In [ ]:
save_array(os.path.join(model_dir, 'conv_predictions.bc'), conv_model_predictions)
save_array(os.path.join(model_dir, 'conv_labels.bc'), prediction_labels)
save_array(os.path.join(model_dir, 'val_predictions.bc'), val_predictions)
save_array(os.path.join(model_dir, 'val_labels.bc'), to_categorical(val_batches.classes))

Train fully connected layers only

Import data from disk


In [ ]:
conv_predictions = load_array(os.path.join(model_dir, 'conv_predictions.bc'))
conv_labels = load_array(os.path.join(model_dir, 'conv_labels.bc'))
conv_val_predictions = load_array(os.path.join(model_dir, 'val_predictions.bc'))
conv_val_labels = load_array(os.path.join(model_dir, 'val_labels.bc'))

Use data to train model


In [ ]:
dropout = 0.8
model = fc_model(dropout)
epochs = 10
lr = 0.0001
model.compile(optimizer=Adam(lr), 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

model.optimizer.lr.set_value(lr)
model.fit(conv_predictions,
          conv_labels,
          batch_size=batch_size,
          nb_epoch=epochs,
          validation_data=(conv_val_predictions, conv_val_labels))

In [ ]:
lr = 0.00001
epochs = 2
model.optimizer.lr.set_value(lr)
model.fit(conv_predictions,
          conv_labels,
          batch_size=batch_size,
          nb_epoch=epochs,
          validation_data=(conv_val_predictions, conv_val_labels))

In [ ]:
model.save_weights(os.path.join(model_dir, 'final_predictor.h5'))

Load weights from trained model, and generate predictions


In [8]:
dropout = 0.8
model = fc_model(dropout)
lr = 0.0001
model.compile(optimizer=Adam(lr), 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

model.optimizer.lr.set_value(lr)
model.load_weights(os.path.join(model_dir, 'final_predictor.h5'))

In [ ]:
test_input = load_array(os.path.join(model_dir, 'test_inputs.bc'))

In [10]:
test_predictions = model.predict(test_input)

In [14]:
test_predictions[1:3,:]


Out[14]:
array([[  4.33069140e-01,   2.62587871e-02,   8.50916728e-02,
          2.25995868e-01,   2.30034534e-02,   4.61317264e-02,
          1.71671864e-02,   1.53106423e-02,   1.07109718e-01,
          2.08618529e-02],
       [  1.30285183e-03,   5.67891548e-05,   3.76506563e-04,
          1.87800475e-03,   9.90259051e-01,   6.48322923e-04,
          2.67745648e-03,   2.02435971e-04,   8.06084485e-04,
          1.79246720e-03]], dtype=float32)

Convert to proper CSV

Now that I have the predictions, I need to put them into a proper .csv file that Kaggle will understand. That means:

  • Clipping the values so that over-confident wrong predictions don't get punished too heavily by the log-loss metric (see the short arithmetic below)
  • Figuring out the corresponding filenames
  • Adding a proper header
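
Kaggle scores this competition with multi-class log loss, i.e. -log(p) where p is the probability the submission assigns to the true class. A confidently wrong row with p = 0.001 on its true class costs -ln(0.001) ≈ 6.9 on its own, whereas clipping to [0.02, 0.98] caps the worst case at -ln(0.02) ≈ 3.9, at the small cost of correct rows scoring -ln(0.98) ≈ 0.02 instead of ~0.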

In [15]:
clipped_predictions = np.clip(test_predictions, 0.02, 0.98)

In [21]:
# Sort so the order matches the (shuffle=False) test generator, which walks the files in sorted order
filename_list = sorted(os.listdir(os.path.join(test_path, 'unknown')))

In [31]:
filename_array = np.transpose(np.array(filename_list, ndmin=2))

In [38]:
csv_headless = np.concatenate([filename_array, clipped_predictions], axis=1)

In [46]:
header_list = [
    'img',
    'c0', 
    'c1',
    'c2',
    'c3',
    'c4',
    'c5',
    'c6',
    'c7',
    'c8',
    'c9',
]
header_line = np.array(header_list, ndmin=2)

In [53]:
ans_array = np.concatenate([header_line, csv_headless])
# ans_array = ans_array.astype('|S10')

In [54]:
np.savetxt(os.path.join(data_dir, "submission.csv"), ans_array, delimiter=',', fmt='%s')
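
An alternative sketch for this last step, since the csv module is already imported at the top: write the rows directly instead of building a string array. This assumes filename_list, clipped_predictions, header_list and data_dir from the cells above; the output filename is arbitrary.

    with open(os.path.join(data_dir, 'submission_csv_module.csv'), 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(header_list)
        # one row per test image: filename followed by its ten clipped class probabilities
        for filename, row in zip(filename_list, clipped_predictions):
            writer.writerow([filename] + ['%.6f' % p for p in row])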

In [55]:
data_dir


Out[55]:
'/home/ubuntu/homework/homework3/notebooks/../data'
